About the data:

Series ID: GSE21933 Platform: Phalanx Human OneAray v5 GPL6254

Experiment type: Expression profiling by Array Details:

Phalanx Biotech Group’s Human OneArray v5 contains 32,048 features, 30968 detection probes and 1080 control probes, spotted onto glass slides using a proprietary non-contact printing method. Detection probes are annotated against the human genome and grouped into the following categories:

Lo FY, Chang JW, Chang IS, Chen YJ et al. The database of chromosome imbalance regions and genes resided in lung cancer from Asian and Caucasian identified by array-comparative genomic hybridization. BMC Cancer 2012 Jun 12;12:235. PMID: 22691236

Data and Normalization

data_id <- "GSE21933"

eset <-getGEO(data_id) #get dataset
## Found 1 file(s)
## GSE21933_series_matrix.txt.gz
gse <- eset[[1]] #get gene dataset

# head(pData(gse)) #phenotype data
# head(fData(gse)) #feature data
# head(exprs(gse)) #complete expression data set

summary(exprs(gse)) #statistical analysis of all summary()
##    GSM545615         GSM545616         GSM545617          GSM545618      
##  Min.   : 0.1536   Min.   : 0.4077   Min.   : 0.04938   Min.   : 0.5312  
##  1st Qu.: 3.8381   1st Qu.: 3.8453   1st Qu.: 3.84864   1st Qu.: 3.8799  
##  Median : 5.8976   Median : 5.8950   Median : 5.89495   Median : 5.8941  
##  Mean   : 6.2761   Mean   : 6.2757   Mean   : 6.27665   Mean   : 6.2752  
##  3rd Qu.: 8.4292   3rd Qu.: 8.4269   3rd Qu.: 8.42835   3rd Qu.: 8.4281  
##  Max.   :15.9989   Max.   :15.9989   Max.   :15.99895   Max.   :15.9989  
##    GSM545619        GSM545620         GSM545621         GSM545622       
##  Min.   : 0.283   Min.   : 0.3436   Min.   : 0.1726   Min.   : 0.01868  
##  1st Qu.: 3.819   1st Qu.: 3.8353   1st Qu.: 3.8568   1st Qu.: 3.84881  
##  Median : 5.894   Median : 5.8965   Median : 5.8949   Median : 5.89586  
##  Mean   : 6.277   Mean   : 6.2760   Mean   : 6.2766   Mean   : 6.27655  
##  3rd Qu.: 8.430   3rd Qu.: 8.4286   3rd Qu.: 8.4292   3rd Qu.: 8.42895  
##  Max.   :15.999   Max.   :15.9989   Max.   :15.9989   Max.   :15.99893  
##    GSM545623         GSM545624         GSM545625           GSM545626     
##  Min.   : 0.2089   Min.   : 0.2229   Min.   : 0.008928   Min.   : 0.039  
##  1st Qu.: 3.8365   1st Qu.: 3.8332   1st Qu.: 3.852787   1st Qu.: 3.845  
##  Median : 5.8907   Median : 5.8982   Median : 5.895908   Median : 5.896  
##  Mean   : 6.2757   Mean   : 6.2759   Mean   : 6.275721   Mean   : 6.276  
##  3rd Qu.: 8.4298   3rd Qu.: 8.4313   3rd Qu.: 8.429130   3rd Qu.: 8.428  
##  Max.   :15.9989   Max.   :15.9989   Max.   :15.998947   Max.   :15.999  
##    GSM545627         GSM545628          GSM545629         GSM545630      
##  Min.   : 0.0598   Min.   : 0.01137   Min.   : 0.2113   Min.   : 0.1679  
##  1st Qu.: 3.8114   1st Qu.: 3.82077   1st Qu.: 3.8270   1st Qu.: 3.8655  
##  Median : 5.9008   Median : 5.89057   Median : 5.8864   Median : 5.8968  
##  Mean   : 6.2769   Mean   : 6.27676   Mean   : 6.2747   Mean   : 6.2761  
##  3rd Qu.: 8.4302   3rd Qu.: 8.43115   3rd Qu.: 8.4314   3rd Qu.: 8.4304  
##  Max.   :15.9989   Max.   :15.99893   Max.   :15.9989   Max.   :15.9989  
##    GSM545631          GSM545632        GSM545633          GSM545634        
##  Min.   : 0.02635   Min.   : 0.260   Min.   : 0.05111   Min.   : 0.009472  
##  1st Qu.: 3.83963   1st Qu.: 3.867   1st Qu.: 3.85254   1st Qu.: 3.840950  
##  Median : 5.89913   Median : 5.900   Median : 5.89552   Median : 5.894485  
##  Mean   : 6.27604   Mean   : 6.275   Mean   : 6.27769   Mean   : 6.277425  
##  3rd Qu.: 8.42879   3rd Qu.: 8.428   3rd Qu.: 8.43166   3rd Qu.: 8.430225  
##  Max.   :15.99893   Max.   :15.999   Max.   :15.99890   Max.   :15.998947  
##    GSM545635        GSM545636        GSM545637         GSM545638      
##  Min.   : 0.000   Min.   : 0.523   Min.   : 0.3539   Min.   : 0.3566  
##  1st Qu.: 3.844   1st Qu.: 3.834   1st Qu.: 3.8428   1st Qu.: 3.8392  
##  Median : 5.894   Median : 5.889   Median : 5.9023   Median : 5.8975  
##  Mean   : 6.277   Mean   : 6.273   Mean   : 6.2740   Mean   : 6.2766  
##  3rd Qu.: 8.429   3rd Qu.: 8.427   3rd Qu.: 8.4290   3rd Qu.: 8.4297  
##  Max.   :15.999   Max.   :15.999   Max.   :15.9989   Max.   :15.9989  
##    GSM545639         GSM545640          GSM545641         GSM545642       
##  Min.   : 0.2665   Min.   : 0.08381   Min.   : 0.4029   Min.   : 0.05743  
##  1st Qu.: 3.8352   1st Qu.: 3.87108   1st Qu.: 3.8324   1st Qu.: 3.83935  
##  Median : 5.8931   Median : 5.89739   Median : 5.8930   Median : 5.89279  
##  Mean   : 6.2754   Mean   : 6.27513   Mean   : 6.2761   Mean   : 6.27641  
##  3rd Qu.: 8.4288   3rd Qu.: 8.43237   3rd Qu.: 8.4306   3rd Qu.: 8.43054  
##  Max.   :15.9989   Max.   :15.99895   Max.   :15.9989   Max.   :15.99893  
##    GSM545643         GSM545644         GSM545645          GSM545646      
##  Min.   : 0.3071   Min.   : 0.4294   Min.   : 0.03132   Min.   : 0.3196  
##  1st Qu.: 3.8600   1st Qu.: 3.8814   1st Qu.: 3.84646   1st Qu.: 3.8684  
##  Median : 5.8883   Median : 5.8968   Median : 5.89639   Median : 5.9005  
##  Mean   : 6.2759   Mean   : 6.2753   Mean   : 6.27705   Mean   : 6.2748  
##  3rd Qu.: 8.4299   3rd Qu.: 8.4290   3rd Qu.: 8.42751   3rd Qu.: 8.4301  
##  Max.   :15.9989   Max.   :15.9989   Max.   :15.99895   Max.   :15.9989  
##    GSM545647         GSM545648           GSM545649         GSM545650      
##  Min.   : 0.1531   Min.   : 0.008351   Min.   : 0.1819   Min.   : 0.3828  
##  1st Qu.: 3.8076   1st Qu.: 3.839472   1st Qu.: 3.8272   1st Qu.: 3.8240  
##  Median : 5.8911   Median : 5.896806   Median : 5.8958   Median : 5.8992  
##  Mean   : 6.2754   Mean   : 6.277289   Mean   : 6.2767   Mean   : 6.2751  
##  3rd Qu.: 8.4308   3rd Qu.: 8.430428   3rd Qu.: 8.4304   3rd Qu.: 8.4293  
##  Max.   :15.9989   Max.   :15.998947   Max.   :15.9989   Max.   :15.9989  
##    GSM545651         GSM545652          GSM545653         GSM545654       
##  Min.   : 0.1291   Min.   : 0.07773   Min.   : 0.3255   Min.   : 0.01947  
##  1st Qu.: 3.8631   1st Qu.: 3.84617   1st Qu.: 3.8466   1st Qu.: 3.85324  
##  Median : 5.8989   Median : 5.89233   Median : 5.8969   Median : 5.89586  
##  Mean   : 6.2770   Mean   : 6.27683   Mean   : 6.2754   Mean   : 6.27742  
##  3rd Qu.: 8.4321   3rd Qu.: 8.43029   3rd Qu.: 8.4304   3rd Qu.: 8.43093  
##  Max.   :15.9989   Max.   :15.99895   Max.   :15.9989   Max.   :15.99893  
##    GSM545655           GSM545656       
##  Min.   : 0.005867   Min.   : 0.04854  
##  1st Qu.: 3.838326   1st Qu.: 3.83994  
##  Median : 5.897306   Median : 5.89339  
##  Mean   : 6.276177   Mean   : 6.27702  
##  3rd Qu.: 8.429420   3rd Qu.: 8.42844  
##  Max.   :15.998928   Max.   :15.99893

The Phenotype Data contains 42 samples and 40 features for the samples. The Feature Data: The feature data contains 30967 probes and 10 features for each samples.

Looking at the summary of expressions, the values for each sample falls between 0-16. Suggesting that these values a log2 normalized. Further, the figure below shows a boxplot of all the gene count values for each sample. As seen below, all the samples have lined up unifromly suggesting that the data is already log normalized.

boxplot(exprs(gse), outline = FALSE, col = "gold")

Inspecting clinical Data

There are several columns within the metadata many of these are repeating. The most important of these columns are the column-1 - title, and columns 36 to 40 age:ch1,histology:ch1,sex:ch1,stage:ch1,tissue:ch1. Further, most of the column names have a “:ch1” at the end. This was removed. Lastly, the histology and stage columns have no data for negative samples. This was changed to “neg”.

#Preparing sample metadata
samplesinfo <- pData(gse) #gettign sample data
samplesinfo <- samplesinfo[,c(1,36:40)] #There are multiple columns for same data, acquiring relevant metadata information.
colnames(samplesinfo) <- gsub(":ch1","",colnames(samplesinfo))

samplesinfo <- samplesinfo%>%
  mutate(across(c(3,5),~replace_na(.,"Neg")))

samplesinfo <- samplesinfo%>%
  mutate_all(~gsub(" years","",.))%>%
  mutate(across(c(histology,sex,stage,tissue),factor))%>%
  mutate(age = as.numeric(age))%>%
  mutate(tissue = ifelse(tissue == "primary normal lung tissue","normal","tumor"))

samplesinfo$histology <- relevel(samplesinfo$histology, ref = "Neg")
samplesinfo$stage <- relevel(samplesinfo$stage, ref = "Neg")
samplesinfo$tissue <- relevel(as.factor(samplesinfo$tissue), ref = "normal")

#Prepare Features Metadata 
featuresinfo <- fData(gse)
#Prepare expression data
exprs_data <- exprs(gse)

The data set represents 42 lung tissue samples, 21 primary normal lung tissues and 21 primary lung tumor tissues. The tumor tissues are representing six stages of lung cancer IA,IB,IIB,IIIA,IIIB,IV. Further two hematological patterns are also represented in tumor samples, these are: adenocarcinoma (AD) and Squamous cell carcinoma (SQ).

samplesinfo%>%
  group_by(tissue, stage,histology)%>%
  tally()%>%
  spread(histology,n)

Clustering and PCA analysis

corMatrix <- cor(exprs_data, use = "c")
rownames(samplesinfo) <- colnames(corMatrix)
pheatmap(corMatrix, annotation_col =samplesinfo[,3:6], annotation_row = samplesinfo[,3:6], cluster_rows = T, cluster_cols = T)

#Hierarchical cluster based dimension analysis
htree <- hclust(dist(t(exprs_data)), method = "average")
plot(htree) #It seems there is a specific pattern of clustering within samples. And this might be biologically relevant difference so I think it is better to makes sure we check with PCA,the variance cotnribution. 

pca_gse <- prcomp(t(exprs_data)) # Run PCA analysis

screeplot(pca_gse, npcs=min(10,length(pca_gse$sdev)),type = c("barplot","lines")) #Screeplots of all the PC;s and their cotnribution. 

# PCA plot by tissue
p1 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=tissue, label = paste(tissue)))+
  geom_point(size=5)+
  geom_text_repel()
  

p2 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=stage, label = paste(stage)))+
  geom_point()+
  geom_text_repel()

p3 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=histology, label = paste(histology)))+
  geom_point()+
  geom_text_repel()

p4 <- cbind(samplesinfo, pca_gse$x)%>%
  ggplot(aes(x= PC1, y = PC2, col=title, label = paste(title)))+
  geom_point(size = 5)+
  geom_text_repel(size = 5)+
  ggsci::scale_fill_nejm()


ggarrange(p1,p2,p3,p4, ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom")

Filter dataset:

First I am filtering out only genes that belong to group-6 in the array as that represents the control genes. Lowly expressed genes must be filtered out as they can deviate the results of gene expression. The cut off for low expression is median gene expression of each sample. Next, I am applying three seperate strategies: 1) All samples have a data 2) At least 50% samples have all the data 3) At least 25% of sample have a data. However, this is a microarray dataset.

Exploring cut offs:

control_list <- rownames(gse@featureData[which(gse@featureData@data$GROUP ==6)]) #create a vector list of controls
control_matrix <- exprs_data[rownames(exprs_data)%in% control_list,] #isolate a matrix of just controls

mean_control <- colMeans(control_matrix)
mean_control <- data.frame(mean_control)%>%
  rownames_to_column(var = "control")
colnames(mean_control) <- c("controls","mean")

cat(paste0("Mean of Controls= ",mean(control_matrix),"\n","Mean of data= ",mean(exprs_data),"\n", "Median of Controls= ",median(control_matrix),"\n","Median of data= ",median(exprs_data)))
## Mean of Controls= 8.31688252769473
## Mean of data= 6.27604254788615
## Median of Controls= 7.96752265
## Median of data= 5.894992
cutoff <- median(exprs_data) #take median of expression dataset
is_expressed <- exprs_data>cutoff

keep_all<- rowSums(is_expressed) >=42 #stringent should be present in all samples.
keep_50 <- rowSums(is_expressed)>=21 #should be present in at leasst 50% samples.
keep <- rowSums(is_expressed)<10 # lenient threshold present in at least 25% samples

gse_filt <- rbind(table(keep),table(keep_50),table(keep_all))
rownames(gse_filt) <- c("25%","50%","100%")
gse_filt
##      FALSE  TRUE
## 25%  17877 13090
## 50%  15596 15371
## 100% 22070  8897
gse_25 <- gse[keep,]
gse_50 <- gse[keep_50,]
gse_100 <- gse[keep_all,]

dim_table <- data.frame(SampleCount = c("25%","50%","100%"),
                        features = c(dim(gse_25)[1],dim(gse_50)[1],dim(gse_100)[1]),
                        features = c(dim(gse_25)[2],dim(gse_50)[2],dim(gse_100)[2]))

print(dim_table)
##   SampleCount features features.1
## 1         25%    13090         42
## 2         50%    15371         42
## 3        100%     8897         42
plot(density(control_matrix), main = "Expression Density", xlab = "log2 intensity",)
abline(v = c(1,6), col = c("red","blue"), lty = 2)  # typical cutoff

Filter genes:

Differential Expression

Combined effect and interaction effect

set.seed(1234)
design_combined <- model.matrix(~0+tissue+ histology+stage, data = samplesinfo)
design_interaction <- model.matrix(~tissue*histology*stage, data = samplesinfo)

AD versus SQ versus neg

design_gse <- model.matrix(~0+samplesinfo$histology)
colnames(design_gse)<-c("neg","AD","SQ")

fit <- lmFit(exprs(gse_100),design_gse) #fit all the genes
contrasts_gse <- makeContrasts( ADvsneg = AD-neg, #AD versus negative
                                SQvsneg = SQ-neg, #SQ versus negative
                                ADvsSQ = AD-SQ, #AD versus SQ
                                SQvsAD = SQ-AD, #SQ versus AD
                                avsavg = (AD+SQ)/2-neg, #negative versus combined disease how significantly different SQ and AD are from negative.
                                ADvsSQtoneg = ((AD-neg)-(SQ-neg)), #Checks if AD is significantly more different than SQ
                                levels =design_gse)
cont_names <- colnames(contrasts_gse)

result_all <- data.frame()
for (i in cont_names){
  fit_contrast <- contrasts.fit(fit, contrasts_gse[,i])
  fit_contrast <- eBayes(fit_contrast)
  results <- topTable(fit_contrast, adjust="fdr", number=Inf)
  results$Genes <- rownames(results)
  results$Contrasts<- i
  result_all <- bind_rows(result_all,results)
  
  plot <-ggplot(results, aes(x = logFC, y = -log10(adj.P.Val))) +
    geom_point(aes(color = adj.P.Val < 0.05), alpha = 0.5) +
    scale_color_manual(values = c("black", "red")) +
    labs(title = paste("Volcano Plot:", i), x = "Log2 Fold Change", y = "-Log10 Adjusted P-value") +
    theme_minimal()
  print(plot)
  ggsave(filename = paste0(i,"_volcano.png"), path = "../figures")
  
}

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image

## Saving 7 x 5 in image
#volcano plot
ggplot(result_all, aes(x = logFC, y = -log10(adj.P.Val), color = Contrasts)) +
  geom_point(alpha = 0.5) +
  labs(title = "Volcano Plot for Multiple Contrasts", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value") +
  theme_minimal()

ggsave(filename = "volcano_all_contrasts.png",path = "../figures")
## Saving 7 x 5 in image

WGCNA

gsg <- goodSamplesGenes(t(exprs_data)) #explore data for WGCNA to ensure all genes and columns are good. 
##  Flagging genes and samples with too many missing values...
##   ..step 1
print(paste0("All samples are looking good: "))
## [1] "All samples are looking good: "
head(gsg$allOK) #IF TRUE all the values are OK. 
## [1] TRUE
head(gsg$goodGenes) #All genes are OK. 
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
head(gsg$goodSamples) # all samples are OK (42 of 42)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE

Chose Threshold

I chose the threshold to be at 15 as this had a mean connectivity of at scale free topology index of

softPower <- 10  

cat("The soft power is",softPower)
## The soft power is 10
adj_matrix <- exprs_data #filtered genes present in all samples
adj_matrix <- t(adj_matrix)
adjacency <- adjacency(adj_matrix, power = softPower)

# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency)
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM <- 1 - TOM

# Hierarchical clustering
geneTree <- hclust(as.dist(dissTOM), method = "average")
plot(geneTree, main = "Gene clustering")

# Module identification using dynamic tree cut
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
                             deepSplit = 2, pamRespectsDendro = FALSE,
                             minClusterSize = 30)
##  ..cutHeight not given, setting it to 0.999  ===>  99% of the (truncated) height range in dendro.
##  ..done.
cat("DYnamic tree with modules:")
## DYnamic tree with modules:
table(dynamicMods)
## dynamicMods
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 3232 3818 2114 1567 1556 1522 1483 1125 1072  963  905  810  773  694  451  435 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##  400  354  312  290  287  283  279  277  262  257  251  247  230  226  220  216 
##   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47 
##  213  212  201  187  183  182  179  154  148  146  144  141  139  138  131  125 
##   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62 
##  124  123  121  120  118  112  104   96   92   84   84   76   64   60   55
# Assign module colors
dynamicColors <- labels2colors(dynamicMods)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

cat("DYnamic tree modules with colors:")
## DYnamic tree modules with colors:
table(dynamicColors)
## dynamicColors
##   antiquewhite4         bisque4           black            blue           brown 
##              60             125            1125            2114            1567 
##          brown4          coral1          coral2            cyan       darkgreen 
##             131              64              55             451             279 
##        darkgrey     darkmagenta  darkolivegreen      darkorange     darkorange2 
##             262             201             212             251             138 
##         darkred   darkseagreen4   darkslateblue   darkturquoise     floralwhite 
##             283              76             124             277             139 
##           green     greenyellow            grey          grey60       honeydew1 
##            1522             810            3232             354              84 
##           ivory  lavenderblush3       lightcyan      lightcyan1      lightgreen 
##             141              84             400             144             312 
##      lightpink4 lightsteelblue1     lightyellow         magenta          maroon 
##              92             146             290             963              96 
##   mediumpurple3    midnightblue    navajowhite2          orange      orangered4 
##             148             435             104             257             154 
##   paleturquoise  palevioletred3            pink           plum1           plum2 
##             216             112            1072             179             123 
##          purple             red       royalblue     saddlebrown          salmon 
##             905            1483             287             226             694 
##         salmon4         sienna3         skyblue        skyblue3       steelblue 
##             118             187             230             182             220 
##             tan        thistle1        thistle2       turquoise          violet 
##             773             120             121            3818             213 
##           white          yellow     yellowgreen 
##             247            1556             183
# STEP-4: Relate Modules to Traits
# Calculate eigengenes
MEs <- moduleEigengenes(adj_matrix, colors = dynamicColors)$eigengenes
MEs <- orderMEs(MEs)

# Correlate eigengenes with traits
moduleTraitCor <- cor(MEs, samplesinfo, use = "p")
## Warning in storage.mode(y) <- "double": NAs introduced by coercion
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples = ncol(adj_matrix))

# Plot correlation heatmap
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(samplesinfo),
               yLabels = names(MEs),
               colorLabels = FALSE,
               colors = blueWhiteRed(50),
               textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                                  signif(moduleTraitPvalue, 1), ")", sep = ""),
               main = "Module-Trait Relationships")

# STEP-5: GET Hub genes from significant moduels
# Choose a module of interest (e.g. "blue")
module <- "blue"
module_genes <- names(adj_matrix)[which(dynamicColors == module)]

# Calculate intra-modular connectivity (hubness)
kWithin <- intramodularConnectivity(adjacency, dynamicColors)

# Order genes by connectivity
hub_genes <- module_genes[order(kWithin[module_genes, "kWithin"], decreasing = TRUE)]
head(hub_genes, 10)  # Top 10 hub genes
## NULL

# tutorial: https://sbc.shef.ac.uk/geo_tutorial/tutorial.nb.html